{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Project Gutenberg Crawler\n",
    "\n",
    "Make sure you read the site's TOS and the notebook's README.md on how to use the crawler."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/gutenberg/project_gutenberg_crawler.ipynb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# uncomment and run below lines to set up if running in colab\n",
    "# !git clone https://github.com/LAION-AI/Open-Assistant.git\n",
    "# %cd Open-Assistant/notebooks/gutenberg\n",
    "# !pip install -r requirements.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# global settings\n",
    "\n",
    "LANG = (\n",
    "    \"en\"  # crawl English language books, NOTE: there are a few hundred books with multiple languages such as 'en; es'\n",
    ")\n",
    "FOLDER = \"text\"  # save metadata and body of text to this folder\n",
    "CHUNKS = 1  # optionally divide the dataset into this many compressed parquet files if you have less memory\n",
    "STATUS = \"crawled.csv\"  # save the list of downloaded files and their status into this csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import required packages\n",
    "import os\n",
    "import io\n",
    "import re\n",
    "import requests\n",
    "import time\n",
    "import warnings\n",
    "\n",
    "try:\n",
    "    from BeautifulSoup import BeautifulSoup\n",
    "except ImportError:\n",
    "    from bs4 import BeautifulSoup\n",
    "from tqdm import tqdm\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from typing import Tuple, Optional, Any"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Code for crawler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "class GutenbergCrawler:\n",
    "    HEADER = {\n",
    "        \"User-Agent\": \"Mozilla/5.0 (compatible; GutenbergCrawler/0.1)\",\n",
    "    }\n",
    "    TIMER = 600  # wait ms between calls\n",
    "    MIRRORS = [\n",
    "        \"http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/\",\n",
    "        \"https://www.gutenberg.org/dirs/\",\n",
    "        \"http://mirrors.xmission.com/gutenberg/\",\n",
    "    ]  # see https://www.gutenberg.org/MIRRORS.ALL for available mirrors\n",
    "\n",
    "    def __init__(self, folder: Optional[str] = None) -> None:\n",
    "        self.folder = folder\n",
    "        if self.folder is not None:\n",
    "            os.makedirs(self.folder, exist_ok=True)\n",
    "        self.calls = 0\n",
    "        self.last_call = 0\n",
    "\n",
    "    def _get(self, url: str) -> Optional[str]:\n",
    "        \"\"\"Fetch ``url`` with the crawler's headers, rate-limited by ``TIMER``.\n",
    "\n",
    "        Sleeps as needed so that at least ``TIMER`` milliseconds separate two\n",
    "        consecutive requests. Returns the decoded response body, or ``None``\n",
    "        when the server answers 404.\n",
    "        \"\"\"\n",
    "        self.calls += 1\n",
    "        # time.time() counts seconds while TIMER is in milliseconds: convert the\n",
    "        # elapsed time to ms before comparing. Subtracting raw seconds from TIMER\n",
    "        # made every call sleep for almost the full TIMER, however long ago the\n",
    "        # previous request was.\n",
    "        elapsed_ms = (time.time() - self.last_call) * 1000.0\n",
    "        wait_ms = max(0.0, self.TIMER - elapsed_ms)\n",
    "        if wait_ms:\n",
    "            time.sleep(wait_ms / 1000.0)\n",
    "        data = requests.get(url, headers=self.HEADER)\n",
    "        self.last_call = time.time()\n",
    "        if data.status_code == 404:\n",
    "            return None\n",
    "        try:\n",
    "            return data.content.decode(\"utf-8\")\n",
    "        except UnicodeDecodeError:\n",
    "            # ISO-8859-1 (latin-1) maps every byte to a character, so this decode\n",
    "            # cannot raise and no further fallback is needed\n",
    "            return data.content.decode(\"ISO-8859-1\")\n",
    "\n",
    "    def catalog(self) -> pd.DataFrame:\n",
    "        try:\n",
    "            csv = pd.read_csv(\"https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv.gz\", sep=\",\")\n",
    "        except Exception:\n",
    "            raw = self._get(\"https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv\")\n",
    "            if raw is None:\n",
    "                raise ValueError(\"Catalog CSV file does not exist!\")\n",
    "            csv = pd.read_csv(io.StringIO(raw), sep=\",\")\n",
    "        return csv.loc[csv[\"Type\"] == \"Text\"].reset_index(drop=True)\n",
    "\n",
    "    def search(self, url: str) -> dict:\n",
    "        \"\"\"Use catalog() instead! Returns dict with book_id: 'book title' pairs for gutenberg.org pages\"\"\"\n",
    "        assert \"/www.gutenberg.org\" in url, \"The URL must be a page at https://www.gutenberg.org/\"\n",
    "        html = self._get(url)\n",
    "        if html is None:\n",
    "            return {}\n",
    "        dom = BeautifulSoup(html, \"html.parser\")\n",
    "        results = {}\n",
    "        for a in dom.find_all(\"a\"):\n",
    "            for elem in re.findall(r\"<a href=\\\"/ebooks/(\\d+)\\\">(.+?)</a>\", str(a)):\n",
    "                ebook, title = elem\n",
    "                results[int(ebook)] = title.replace(\"\\r<br/>\", \"\\r\\n\")\n",
    "        return results\n",
    "\n",
    "    def download(self, book: int) -> Optional[str]:\n",
    "        book = int(book)\n",
    "        assert book > 0\n",
    "        mirror = np.random.choice(self.MIRRORS)\n",
    "        if book < 10:\n",
    "            page = f\"0/{book}/\"\n",
    "        else:\n",
    "            page = \"/\".join([char for char in str(book)[:-1]]) + f\"/{book}/\"\n",
    "        url = f\"{mirror}{page}{book}-h/{book}-h.htm\"\n",
    "        return self._get(url)\n",
    "\n",
    "    def parse(self, book: int, html: str) -> Tuple[Optional[str], Optional[str]]:\n",
    "        book = int(book)\n",
    "        assert book > 0\n",
    "        if html is None:\n",
    "            return None, None\n",
    "        dom = BeautifulSoup(html, \"html.parser\")\n",
    "        if dom is None or dom.title is None or dom.title.string is None or \"404\" in dom.title.string:\n",
    "            return None, None\n",
    "\n",
    "        meta = \"\"\n",
    "        for pre in dom.select(\"title, pre\"):\n",
    "            meta += str(pre.get_text()).strip()\n",
    "            # remove metadata from dom afterwards\n",
    "            pre.extract()\n",
    "        if re.findall(r\"(?i)\\*{2,}[^\\n]+?(?:please.+?copyright|copyrighted.+?project)[^\\n]+?\\*{2,}\\r?\\n\", meta):\n",
    "            warnings.warn(f\"Book {book} is copyrighted.\")\n",
    "            return None, None\n",
    "        for img in dom.select(\"img\"):\n",
    "            # add image alt attributes as text\n",
    "            try:\n",
    "                img.insert(0, img[\"alt\"])\n",
    "            except KeyError:\n",
    "                pass\n",
    "        text = str(dom.get_text()).strip()\n",
    "        if re.findall(r\"(?i)\\*{2,}[^\\n]+?(?:please.+?copyright|copyrighted.+?project)[^\\n]+?\\*{2,}\\r?\\n\", text):\n",
    "            warnings.warn(f\"Book {book} is copyrighted.\")\n",
    "            return None, None\n",
    "\n",
    "        s = re.split(r\"(?i)\\*{2,}[^\\n]+?project gutenberg[^\\n]+?\\*{2,}\\s*[\\r\\n]+\", text)  # 49843\n",
    "        if len(s) > 1:\n",
    "            if len(s) > 3:\n",
    "                warnings.warn(f\"Book {book} is malformed.\")\n",
    "                return None, None\n",
    "            meta += s[0]\n",
    "            return meta, s[1]\n",
    "        return meta, text\n",
    "\n",
    "    @staticmethod\n",
    "    def pretty(text: Optional[str]) -> str:\n",
    "        if not text:\n",
    "            return \"\"\n",
    "        # attempt to remove transcriber's notes\n",
    "        text = re.sub(r\"(?i)(?:\\[|\\b)transcriber[\\'’]?s? notes?\\s*(?:[^\\xa0\\n].*?\\]?(?:\\r?\\n){1,2})+\", \"\", text)\n",
    "        # attempt to remove e-text info\n",
    "        text = re.sub(\n",
    "            r\"(?i)e-text prepared(?:[^\\xa0]\\(?.+\\)?\\r?\\n{1,3})+(?:\\xa0*\\s*note\\:\\s*(?:.+\\s*\\r{0,2}\\n{1,2}){1,5}\\xa0\\s+)?\",\n",
    "            \"\",\n",
    "            text,\n",
    "        )\n",
    "        # standardize line endings\n",
    "        text = \"\\r\\n\".join(text.splitlines())\n",
    "        text = re.sub(r\"(\\r\\n){3,}\", \"\\r\\n\\r\\n\\r\\n\", text).strip()\n",
    "        return text\n",
    "\n",
    "    def _write(self, file: str, content: str) -> None:\n",
    "        path = os.path.join(self.folder, file) if self.folder is not None else file\n",
    "        with open(path, \"w+\", encoding=\"utf-8\") as f:\n",
    "            f.write(content)\n",
    "\n",
    "    def save(self, book: int) -> bool:\n",
    "        html = self.download(book)\n",
    "        meta, text = self.parse(book, html)\n",
    "        if meta:\n",
    "            self._write(f\"{book}_meta.txt\", meta)\n",
    "        if text:\n",
    "            self._write(f\"{book}_text.txt\", text)\n",
    "        return bool(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Start crawling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "gc = GutenbergCrawler(FOLDER)  # use text/ folder to save files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Text#</th>\n",
       "      <th>Type</th>\n",
       "      <th>Issued</th>\n",
       "      <th>Title</th>\n",
       "      <th>Language</th>\n",
       "      <th>Authors</th>\n",
       "      <th>Subjects</th>\n",
       "      <th>LoCC</th>\n",
       "      <th>Bookshelves</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>9936</th>\n",
       "      <td>10631</td>\n",
       "      <td>Text</td>\n",
       "      <td>2004-01-01</td>\n",
       "      <td>Halleck's New English Literature</td>\n",
       "      <td>en</td>\n",
       "      <td>Halleck, Reuben Post, 1859-1936</td>\n",
       "      <td>English literature -- History and criticism</td>\n",
       "      <td>PR</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20456</th>\n",
       "      <td>21443</td>\n",
       "      <td>Text</td>\n",
       "      <td>2007-05-15</td>\n",
       "      <td>Vesty of the Basins</td>\n",
       "      <td>en</td>\n",
       "      <td>Greene, Sarah Pratt McLean, 1856-1935</td>\n",
       "      <td>Maine -- Fiction</td>\n",
       "      <td>PS</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35395</th>\n",
       "      <td>36710</td>\n",
       "      <td>Text</td>\n",
       "      <td>2011-07-12</td>\n",
       "      <td>The Black Opal</td>\n",
       "      <td>en</td>\n",
       "      <td>Prichard, Katharine Susannah, 1884-1969</td>\n",
       "      <td>Opal mines and mining -- Australia -- Fiction</td>\n",
       "      <td>PR</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27832</th>\n",
       "      <td>29139</td>\n",
       "      <td>Text</td>\n",
       "      <td>2009-06-17</td>\n",
       "      <td>No Pets Allowed</td>\n",
       "      <td>en</td>\n",
       "      <td>Cummings, Monette, 1914-1999</td>\n",
       "      <td>Science fiction; Short stories</td>\n",
       "      <td>PS</td>\n",
       "      <td>Science Fiction</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67010</th>\n",
       "      <td>68338</td>\n",
       "      <td>Text</td>\n",
       "      <td>2022-06-17</td>\n",
       "      <td>Nick Carter Stories No. 160, October 2, 1915: ...</td>\n",
       "      <td>en</td>\n",
       "      <td>Carter, Nicholas (House name); Lebhar, Bertram...</td>\n",
       "      <td>Popular literature -- Periodicals; Detective a...</td>\n",
       "      <td>PS</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Text#  Type      Issued  \\\n",
       "9936   10631  Text  2004-01-01   \n",
       "20456  21443  Text  2007-05-15   \n",
       "35395  36710  Text  2011-07-12   \n",
       "27832  29139  Text  2009-06-17   \n",
       "67010  68338  Text  2022-06-17   \n",
       "\n",
       "                                                   Title Language  \\\n",
       "9936                    Halleck's New English Literature       en   \n",
       "20456                                Vesty of the Basins       en   \n",
       "35395                                     The Black Opal       en   \n",
       "27832                                    No Pets Allowed       en   \n",
       "67010  Nick Carter Stories No. 160, October 2, 1915: ...       en   \n",
       "\n",
       "                                                 Authors  \\\n",
       "9936                     Halleck, Reuben Post, 1859-1936   \n",
       "20456              Greene, Sarah Pratt McLean, 1856-1935   \n",
       "35395            Prichard, Katharine Susannah, 1884-1969   \n",
       "27832                       Cummings, Monette, 1914-1999   \n",
       "67010  Carter, Nicholas (House name); Lebhar, Bertram...   \n",
       "\n",
       "                                                Subjects LoCC      Bookshelves  \n",
       "9936         English literature -- History and criticism   PR              NaN  \n",
       "20456                                   Maine -- Fiction   PS              NaN  \n",
       "35395      Opal mines and mining -- Australia -- Fiction   PR              NaN  \n",
       "27832                     Science fiction; Short stories   PS  Science Fiction  \n",
       "67010  Popular literature -- Periodicals; Detective a...   PS              NaN  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get the catalog of ebooks (only text types will be returned)\n",
    "df = gc.catalog()\n",
    "df = df.loc[df[\"Language\"] == LANG]\n",
    "assert len(df), \"No matching items in catalog!\"\n",
    "df = df.sample(frac=1)  # random shuffle\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 out of 55128 items done.\n"
     ]
    }
   ],
   "source": [
    "if os.path.exists(STATUS):\n",
    "    crawled = pd.read_csv(STATUS)\n",
    "else:\n",
    "    crawled = pd.DataFrame({\"book\": [], \"success\": []})\n",
    "print(f\"{len(crawled)} out of {len(df)} items done.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "#39750 To Your Dog and to My Dog (en) ✔️ - 1.626s\n",
      "#22585 —And Devious the Line of Duty (en) ✔️ - 1.634s\n",
      "#40234 The Early Introduction of Bogus Freemasonry in the United States of America and Texas Among Colored Masons (en) ✔️ - 1.634s\n",
      "#64470 Biblical Revision, its duties and conditions\n",
      "A sermon preached in St. Paul's Cathedral at the special evening service, on Sunday, March 13, 1870 (en) ✔️ - 1.402s\n",
      "#66291 The Plagiarist From Rigel IV (en) ✔️ - 1.479s\n",
      "#46746 The Chronicles of Newgate, vol. 2/2 (en) ✔️ - 1.933s\n",
      "#39744 Arne; Early Tales and Sketches\n",
      "Patriots Edition (en) ✔️ - 1.917s\n",
      "#30681 Nanny Merry\n",
      "or, What Made the Difference? (en) ✔️ - 1.184s\n",
      "#68752 Lives and exploits of the most noted highwaymen, robbers and murderers of all nations\n",
      "Drawn from the most authentic sources and brought down to the present time (en) ✔️ - 1.300s\n",
      "#21364 The Rajah of Dah (en) ✔️ - 2.184s\n",
      "#9942 Lectures on Ten British Mathematicians of the Nineteenth Century (en) ❌ - 1.151s\n",
      "#69725 Elementary woodworking (en) ✔️ - 1.421s\n",
      "#51011 The Ghost Camp; or, the Avengers (en) ✔️ - 2.279s\n",
      "#15175 A Century of Wrong (en) ✔️ - 1.587s\n",
      "#11058 Jack Archer: A Tale of the Crimea (en) ✔️ - 1.530s\n",
      "#25017 A Son of the Immortals (en) ✔️ - 2.391s\n",
      "#15476 The Mahabharata of Krishna-Dwaipayana Vyasa, Volume 3\n",
      "Books 8, 9, 10, 11 and 12 (en) ❌ - 1.151s\n",
      "#39870 Bert Wilson, Marathon Winner (en) ✔️ - 3.770s\n",
      "#8066 The Bible, King James version, Book 66: Revelation (en) ❌ - 1.200s\n",
      "#28034 Scenes in the Hawaiian Islands and California (en) ✔️ - 1.033s\n",
      "#61513 The Phantom Death, etc. (en) ✔️ - 1.884s\n",
      "#6180 A Romany of the Snows, vol. 1\n",
      "Being a Continuation of the Personal Histories of \"Pierre and His People\" and the Last Existing Records of Pretty Pierre (en) ❌ - 0.718s\n",
      "#47252 Mother Earth's Children: The Frolics of the Fruits and Vegetables (en) ✔️ - 1.301s\n",
      "#48566 Barracks, Bivouacs and Battles (en) ✔️ - 3.749s\n",
      "#33425 The Crimson Sweater (en) ✔️ - 1.958s\n",
      "▶▶▶ 25 done (21 successful) out of 55128 ◀◀◀\n",
      "Done.\n"
     ]
    }
   ],
   "source": [
    "# NOTE: this will take really long depending on the number of ebooks selected\n",
    "# Every book is recorded in `crawled` and flushed to STATUS right away, so an interrupted run can be resumed.\n",
    "for index, row in df.iterrows():\n",
    "    book = row[\"Text#\"]\n",
    "    if book not in crawled[\"book\"].values:\n",
    "        t = time.time()\n",
    "        print(f\"#{book} {row['Title']} ({row['Language']})\", end=\" \")\n",
    "        success = bool(gc.save(book))\n",
    "        print(\"✔️\" if success else \"❌\", end=\" \")\n",
    "        # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.\n",
    "        # An empty status frame is replaced outright so its placeholder dtypes do not leak into the result.\n",
    "        entry = pd.DataFrame([{\"book\": book, \"success\": success}])\n",
    "        crawled = entry if crawled.empty else pd.concat([crawled, entry], ignore_index=True)\n",
    "        print(f\"- {(time.time() - t):.3f}s\")\n",
    "        crawled.to_csv(STATUS, index=False)\n",
    "        if len(crawled) % 25 == 0:\n",
    "            print(f\"▶▶▶ {len(crawled)} done ({int(crawled['success'].sum()) } successful) out of {len(df)} ◀◀◀\")\n",
    "\n",
    "print(\"Done.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Add the crawled text files into parquet datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "21 out of 55128 (0.04%) available.\n"
     ]
    }
   ],
   "source": [
    "crawled = pd.read_csv(STATUS)\n",
    "crawled = crawled.loc[crawled[\"success\"] == True]\n",
    "crawled.rename(columns={\"book\": \"Text#\"}, inplace=True)\n",
    "\n",
    "gc = GutenbergCrawler(FOLDER)\n",
    "df = gc.catalog()\n",
    "df = df.loc[df[\"Language\"] == LANG]\n",
    "\n",
    "print(f\"{len(crawled)} out of {len(df)} ({len(crawled) / len(df) * 100.:.2f}%) available.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "21"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.drop_duplicates(subset=[\"Text#\"], inplace=True)\n",
    "df = pd.merge(df, crawled, on=[\"Text#\"], how=\"inner\")\n",
    "assert not len(df.loc[df[\"success\"] == False])\n",
    "del crawled\n",
    "df.drop(columns=[\"Type\", \"Language\", \"success\"], inplace=True)\n",
    "df.sort_values(by=\"Text#\", ascending=True, inplace=True)\n",
    "len(df)  # number of items after merging with metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 19.45it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 24.30it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 33.12it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 15.04it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 27.61it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "def read(file: str) -> Optional[str]:\n",
    "    result = None\n",
    "    if os.path.exists(file):\n",
    "        with open(file, \"r\", encoding=\"utf-8\") as f:\n",
    "            result = f.read()\n",
    "    return result\n",
    "\n",
    "\n",
    "def strip(value: Any) -> str:\n",
    "    return str(value).strip() if value and pd.notna(value) else \"\"\n",
    "\n",
    "\n",
    "for chunk in range(CHUNKS):\n",
    "    n = len(df) // CHUNKS\n",
    "    start, end = chunk * n, (chunk + 1) * n if chunk < CHUNKS - 1 else len(df)\n",
    "\n",
    "    updated = {col: [] for col in list(df.columns) + [\"Body\"]}\n",
    "    books = df[\"Text#\"].values[start:end]\n",
    "    for book in tqdm(books):\n",
    "        text = read(os.path.join(FOLDER, f\"{book}_text.txt\"))\n",
    "        text = gc.pretty(text)\n",
    "        if not text:\n",
    "            continue\n",
    "\n",
    "        df_row = df.loc[df[\"Text#\"] == book]\n",
    "        updated[\"Text#\"].append(book)\n",
    "        updated[\"Issued\"].append(pd.to_datetime(df_row[\"Issued\"].values[0], format=\"%Y-%m-%d\", errors=\"coerce\"))\n",
    "        updated[\"Title\"].append(strip(df_row[\"Title\"].values[0]))\n",
    "        updated[\"Authors\"].append(strip(df_row[\"Authors\"].values[0]))\n",
    "        updated[\"Subjects\"].append(strip(df_row[\"Subjects\"].values[0]))\n",
    "        updated[\"LoCC\"].append(strip(df_row[\"LoCC\"].values[0]))\n",
    "        updated[\"Bookshelves\"].append(strip(df_row[\"Bookshelves\"].values[0]))\n",
    "        updated[\"Body\"].append(text)\n",
    "\n",
    "    updated = pd.DataFrame(updated)\n",
    "    if CHUNKS == 1:\n",
    "        updated.to_parquet(f\"gutenberg_{LANG}_all.pq\", index=False, engine=\"pyarrow\", compression=\"gzip\")\n",
    "    else:\n",
    "        updated.to_parquet(\n",
    "            f\"gutenberg_{LANG}_{chunk + 1}_of_{CHUNKS}.pq\", index=False, engine=\"pyarrow\", compression=\"gzip\"\n",
    "        )\n",
    "    del updated\n",
    "\n",
    "print(\"Done.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
